{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n"
     ]
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "from sklearn.metrics import classification_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "word2idx = {'<pad>': 0}\n",
    "tag2idx = {'<pad>': 0}\n",
    "word_idx = 1\n",
    "tag_idx = 1\n",
    "x_train = []\n",
    "y_train = []\n",
    "x_test = []\n",
    "y_test = []\n",
    "\n",
    "for line in open('pos_train.txt'):\n",
    "    line = line.rstrip()\n",
    "    if line:\n",
    "        word, tag, _ = line.split()\n",
    "        if word not in word2idx:\n",
    "            word2idx[word] = word_idx\n",
    "            word_idx += 1\n",
    "        x_train.append(word2idx[word])\n",
    "        if tag not in tag2idx:\n",
    "            tag2idx[tag] = tag_idx\n",
    "            tag_idx += 1\n",
    "        y_train.append(tag2idx[tag])\n",
    "        \n",
    "word2idx['<unknown>'] = word_idx\n",
    "\n",
    "for line in open('pos_test.txt'):\n",
    "    line = line.rstrip()\n",
    "    if line:\n",
    "        word, tag, _ = line.split()\n",
    "        if word in word2idx:\n",
    "            x_test.append(word2idx[word])\n",
    "        else:\n",
    "            x_test.append(word_idx)\n",
    "        y_test.append(tag2idx[tag])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {\n",
    "    'seq_len': 20,\n",
    "    'batch_size': 128,\n",
    "    'hidden_dim': 128,\n",
    "    'clip_norm': 5.0,\n",
    "    'text_iter_step': 1,\n",
    "    'lr': {'start': 5e-3, 'end': 5e-4},\n",
    "    'n_epoch': 2,\n",
    "    'num_layers':2,\n",
    "    'display_step': 50,\n",
    "    'vocab_size': len(word2idx),\n",
    "    'n_class':tag_idx\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def iter_seq(x):\n",
    "    return np.array([x[i: i+params['seq_len']] for i in range(0, len(x)-params['seq_len'], params['text_iter_step'])])\n",
    "\n",
    "def to_train_seq(*args):\n",
    "    return [iter_seq(x) for x in args]\n",
    "\n",
    "def to_test_seq(*args):\n",
    "    return [np.reshape(x[:(len(x)-len(x)%params['seq_len'])],\n",
    "        [-1,params['seq_len']]) for x in args]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, Y_train = to_train_seq(x_train, y_train)\n",
    "X_test, Y_test = to_test_seq(x_test, y_test)\n",
    "params['lr']['steps'] = len(X_train) // params['batch_size']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(211707, 20)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Model:\n",
    "    def __init__(self):\n",
    "        \n",
    "        def clip_grads(loss):\n",
    "            variables = tf.trainable_variables()\n",
    "            grads = tf.gradients(loss, variables)\n",
    "            clipped_grads, _ = tf.clip_by_global_norm(grads, params['clip_norm'])\n",
    "            return zip(clipped_grads, variables)\n",
    "        \n",
    "        def cells(reuse=False):\n",
    "            return tf.nn.rnn_cell.LSTMCell(params['hidden_dim'],initializer=tf.orthogonal_initializer(),\n",
    "                                           reuse=reuse)\n",
    "        \n",
    "        self.X = tf.placeholder(tf.int32, [None, params['seq_len']])\n",
    "        self.Y = tf.placeholder(tf.int32, [None, params['seq_len']])\n",
    "        self.X_seq_len = tf.placeholder(tf.int32, [None])\n",
    "        self.Y_seq_len = tf.placeholder(tf.int32, [None])\n",
    "        self.batch_size = tf.placeholder(tf.int32, None)\n",
    "        \n",
    "        batch_size = self.batch_size\n",
    "        encoder_embeddings = tf.Variable(tf.random_uniform([params['vocab_size'], \n",
    "                                                            params['hidden_dim']], -1, 1))\n",
    "        decoder_embeddings = tf.Variable(tf.random_uniform([params['n_class'], params['hidden_dim']], -1, 1))\n",
    "        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)\n",
    "        main = tf.strided_slice(self.X, [0, 0], [batch_size, -1], [1, 1])\n",
    "        decoder_input = tf.concat([tf.fill([batch_size, 1], 0), main], 1)\n",
    "        decoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, decoder_input)\n",
    "        rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(params['num_layers'])])\n",
    "        _, last_state = tf.nn.dynamic_rnn(rnn_cells, encoder_embedded,\n",
    "                                          dtype = tf.float32)\n",
    "        with tf.variable_scope(\"decoder\"):\n",
    "            rnn_cells_dec = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(params['num_layers'])])\n",
    "            outputs, _ = tf.nn.dynamic_rnn(rnn_cells_dec, decoder_embedded, \n",
    "                                           initial_state = last_state,\n",
    "                                           dtype = tf.float32)\n",
    "        self.logits = tf.layers.dense(outputs,params['n_class'])\n",
    "        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)\n",
    "        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.logits,\n",
    "                                                     targets = self.Y,\n",
    "                                                     weights = masks)\n",
    "        \n",
    "        self.global_step = tf.Variable(0, trainable=False)\n",
    "        self.learning_rate = tf.train.exponential_decay(params['lr']['start'],\n",
    "                                                        self.global_step, params['lr']['steps'],\n",
    "                                                        params['lr']['end']/params['lr']['start'])\n",
    "        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(clip_grads(self.cost), \n",
    "                                                                                    global_step=self.global_step)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = Model()\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_accuracy(logits, Y):\n",
    "    acc = 0\n",
    "    for i in range(logits.shape[0]):\n",
    "        internal_acc = 0\n",
    "        for k in range(len(Y[i])):\n",
    "            if Y[i][k] == logits[i][k]:\n",
    "                internal_acc += 1\n",
    "        acc += (internal_acc / len(Y[i]))\n",
    "    return acc / logits.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 1, step 1, loss 3.804602, acc 0.023828\n",
      "epoch 1, step 50, loss 2.641443, acc 0.269141\n",
      "epoch 1, step 100, loss 2.538152, acc 0.269531\n",
      "epoch 1, step 150, loss 2.517906, acc 0.257031\n",
      "epoch 1, step 200, loss 2.560117, acc 0.278125\n",
      "epoch 1, step 250, loss 2.248977, acc 0.331250\n",
      "epoch 1, step 300, loss 2.299693, acc 0.306250\n",
      "epoch 1, step 350, loss 2.440006, acc 0.308203\n",
      "epoch 1, step 400, loss 2.179614, acc 0.334766\n",
      "epoch 1, step 450, loss 2.176687, acc 0.348828\n",
      "epoch 1, step 500, loss 2.124507, acc 0.403906\n",
      "epoch 1, step 550, loss 2.072956, acc 0.399609\n",
      "epoch 1, step 600, loss 2.207045, acc 0.350781\n",
      "epoch 1, step 650, loss 2.013363, acc 0.384375\n",
      "epoch 1, step 700, loss 1.799941, acc 0.428516\n",
      "epoch 1, step 750, loss 1.746237, acc 0.464063\n",
      "epoch 1, step 800, loss 2.087084, acc 0.380078\n",
      "epoch 1, step 850, loss 1.930929, acc 0.469531\n",
      "epoch 1, step 900, loss 2.072091, acc 0.439062\n",
      "epoch 1, step 950, loss 1.820239, acc 0.447656\n",
      "epoch 1, step 1000, loss 1.131939, acc 0.616797\n",
      "epoch 1, step 1050, loss 1.664668, acc 0.471875\n",
      "epoch 1, step 1100, loss 1.794178, acc 0.465625\n",
      "epoch 1, step 1150, loss 1.809082, acc 0.483984\n",
      "epoch 1, step 1200, loss 1.743840, acc 0.499219\n",
      "epoch 1, step 1250, loss 1.616392, acc 0.488281\n",
      "epoch 1, step 1300, loss 1.899947, acc 0.429687\n",
      "epoch 1, step 1350, loss 1.271118, acc 0.602734\n",
      "epoch 1, step 1400, loss 1.602868, acc 0.508203\n",
      "epoch 1, step 1450, loss 1.616175, acc 0.515625\n",
      "epoch 1, step 1500, loss 1.605641, acc 0.500391\n",
      "epoch 1, step 1550, loss 1.792672, acc 0.505078\n",
      "epoch 1, step 1600, loss 1.348211, acc 0.597266\n",
      "epoch 1, step 1650, loss 1.169919, acc 0.660937\n",
      "epoch 1, avg loss 2.027726, avg acc 0.410647\n",
      "epoch 2, step 1700, loss 1.703829, acc 0.559766\n",
      "epoch 2, step 1750, loss 1.394329, acc 0.585156\n",
      "epoch 2, step 1800, loss 1.534068, acc 0.542578\n",
      "epoch 2, step 1850, loss 1.634587, acc 0.519922\n",
      "epoch 2, step 1900, loss 1.635818, acc 0.529297\n",
      "epoch 2, step 1950, loss 1.723435, acc 0.502734\n",
      "epoch 2, step 2000, loss 1.768894, acc 0.458984\n",
      "epoch 2, step 2050, loss 1.629223, acc 0.516797\n",
      "epoch 2, step 2100, loss 1.415045, acc 0.583203\n",
      "epoch 2, step 2150, loss 1.468521, acc 0.593359\n",
      "epoch 2, step 2200, loss 1.834504, acc 0.482422\n",
      "epoch 2, step 2250, loss 1.309275, acc 0.617187\n",
      "epoch 2, step 2300, loss 1.437447, acc 0.544531\n",
      "epoch 2, step 2350, loss 1.335738, acc 0.552344\n",
      "epoch 2, step 2400, loss 0.895011, acc 0.752344\n",
      "epoch 2, step 2450, loss 1.480136, acc 0.557031\n",
      "epoch 2, step 2500, loss 1.651417, acc 0.546484\n",
      "epoch 2, step 2550, loss 1.636562, acc 0.543359\n",
      "epoch 2, step 2600, loss 1.293534, acc 0.560937\n",
      "epoch 2, step 2650, loss 1.243408, acc 0.603906\n",
      "epoch 2, step 2700, loss 1.302576, acc 0.632422\n",
      "epoch 2, step 2750, loss 1.417959, acc 0.603906\n",
      "epoch 2, step 2800, loss 1.346003, acc 0.620313\n",
      "epoch 2, step 2850, loss 1.315368, acc 0.617187\n",
      "epoch 2, step 2900, loss 1.460338, acc 0.573047\n",
      "epoch 2, step 2950, loss 1.497881, acc 0.573437\n",
      "epoch 2, step 3000, loss 1.082688, acc 0.698828\n",
      "epoch 2, step 3050, loss 1.734208, acc 0.501562\n",
      "epoch 2, step 3100, loss 1.565268, acc 0.528516\n",
      "epoch 2, step 3150, loss 1.503111, acc 0.552344\n",
      "epoch 2, step 3200, loss 1.544415, acc 0.529297\n",
      "epoch 2, step 3250, loss 1.493284, acc 0.540625\n",
      "epoch 2, step 3300, loss 1.456298, acc 0.577344\n",
      "epoch 2, avg loss 1.470546, avg acc 0.566526\n"
     ]
    }
   ],
   "source": [
    "for i in range(params['n_epoch']):\n",
    "    total_cost,total_accuracy = 0, 0\n",
    "    for k in range(0,(X_train.shape[0] // params['batch_size'])*params['batch_size'],params['batch_size']):\n",
    "        batch_x = X_train[k:k+params['batch_size']]\n",
    "        batch_y = Y_train[k:k+params['batch_size']]\n",
    "        batch_x_seq = [params['seq_len']]*params['batch_size']\n",
    "        batch_y_seq = [params['seq_len']]*params['batch_size']\n",
    "        predicted, step, loss, _ = sess.run([tf.argmax(model.logits,2), model.global_step, \n",
    "                                             model.cost, model.optimizer],\n",
    "                                            feed_dict={model.X:batch_x, model.Y:batch_y,\n",
    "                                            model.X_seq_len:batch_x_seq,\n",
    "                                            model.Y_seq_len:batch_y_seq,\n",
    "                                            model.batch_size:params['batch_size']})\n",
    "        acc = check_accuracy(predicted,batch_y)\n",
    "        if step % params['display_step'] == 0 or step == 1:\n",
    "            print('epoch %d, step %d, loss %f, acc %f'%(i+1,step,loss, acc))\n",
    "        total_cost += loss\n",
    "        total_accuracy += acc\n",
    "    total_cost /= ((X_train.shape[0] // params['batch_size']))\n",
    "    total_accuracy /= ((X_train.shape[0] // params['batch_size']))\n",
    "    print('epoch %d, avg loss %f, avg acc %f'%(i+1,total_cost,total_accuracy))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          ,       0.45      0.65      0.53      6639\n",
      "        VBD       0.62      0.79      0.69      5070\n",
      "         VB       0.66      0.85      0.74      4020\n",
      "        RBR       0.36      0.24      0.28       912\n",
      "         EX       0.32      0.12      0.18      1354\n",
      "        WP$       0.53      0.45      0.49      1103\n",
      "        VBP       0.83      0.87      0.85      1177\n",
      "          #       0.74      0.81      0.77      1269\n",
      "          )       0.34      0.12      0.18      2962\n",
      "        POS       0.41      0.24      0.31      3034\n",
      "         TO       0.47      0.61      0.53      4803\n",
      "        JJR       0.77      0.93      0.84      2389\n",
      "      <pad>       0.47      0.40      0.43      1214\n",
      "         CC       0.59      0.36      0.44       433\n",
      "          :       0.89      0.92      0.91      1974\n",
      "         IN       0.48      0.37      0.42       539\n",
      "         CD       0.26      0.03      0.05       727\n",
      "        SYM       0.77      0.08      0.15       421\n",
      "         NN       0.64      0.55      0.59      1918\n",
      "         WP       0.49      0.21      0.30       323\n",
      "        JJS       0.60      0.55      0.58       316\n",
      "        NNS       0.35      0.45      0.39      1679\n",
      "          $       0.00      0.00      0.00        48\n",
      "        NNP       0.43      0.22      0.29       470\n",
      "          (       0.00      0.00      0.00        11\n",
      "         MD       0.00      0.00      0.00        77\n",
      "         FW       0.61      0.42      0.50       384\n",
      "         ''       0.00      0.00      0.00        77\n",
      "       NNPS       0.00      0.00      0.00       130\n",
      "         RB       0.48      0.40      0.44       814\n",
      "        WDT       0.00      0.00      0.00        77\n",
      "       PRP$       0.00      0.00      0.00       110\n",
      "         JJ       0.84      0.23      0.36        70\n",
      "        PDT       0.00      0.00      0.00       202\n",
      "        RBS       0.38      0.06      0.10       202\n",
      "        VBN       1.00      0.01      0.02        93\n",
      "         DT       0.00      0.00      0.00        49\n",
      "        VBZ       0.00      0.00      0.00        10\n",
      "         ``       0.00      0.00      0.00        12\n",
      "        WRB       0.42      0.04      0.08       238\n",
      "         RP       0.00      0.00      0.00         4\n",
      "        PRP       0.00      0.00      0.00         4\n",
      "         UH       0.00      0.00      0.00         2\n",
      "\n",
      "avg / total       0.53      0.56      0.52     47360\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/metrics/classification.py:1428: UserWarning: labels size, 43, does not match size of target_names, 45\n",
      "  .format(len(labels), len(target_names))\n",
      "/usr/local/lib/python3.5/dist-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "Y_pred = sess.run(tf.argmax(model.logits,2),feed_dict={model.X:X_test,model.batch_size:X_test.shape[0],\n",
    "                                model.X_seq_len:[params['seq_len']]*X_test.shape[0]})\n",
    "print(classification_report(Y_test.ravel(), Y_pred.ravel(), target_names=tag2idx.keys()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
